The Thera bank recently saw a steep decline in the number of users of its credit card. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, and foreign transaction fees. Some fees are charged to every user irrespective of usage, while others are charged under specified circumstances.
Customers leaving the credit card service would lead the bank to losses, so the bank wants to analyze its customer data to identify the customers who will leave the credit card service and the reasons for doing so — so that the bank can improve in those areas.
You as a Data scientist at Thera bank need to come up with a classification model that will help the bank improve its services so that customers do not renounce their credit cards
You need to identify the best possible model that will give the required performance
# library to suppress warnings or deprecation notes
import warnings
warnings.filterwarnings("ignore")
# libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# library to split data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# remove the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# set the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
from sklearn import metrics
# library to build the Logistic Regression model
from sklearn.linear_model import LogisticRegression
# library to encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# libraries to build ensemble models
from sklearn.ensemble import (
BaggingClassifier,
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier,
)
# libraries to build xgboost model
from xgboost import XGBClassifier
# library for stacking classifier
from sklearn.ensemble import StackingClassifier
# to tune different models
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
# to get different metric scores
from sklearn.metrics import (
recall_score,
confusion_matrix,
)
# for oversampling and undersampling data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# for missing value imputation
from sklearn.impute import SimpleImputer
# Mount Google Drive (Colab) so the data set is accessible
from google.colab import drive
drive.mount('/content/drive')
# Read the BankChurners.csv data set into a dataframe.
# (The original first assigned a relative path string to df and then
# immediately overwrote it with the read_csv result — dead code removed.)
df = pd.read_csv('/content/drive/MyDrive/BankChurners.csv')
# print the data set size: number of rows and columns
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")
# column dtypes and non-null counts
df.info()
Observations:
# fix the NumPy random seed so the sampled rows below are reproducible
np.random.seed(1)
# inspect 10 randomly chosen rows of the data
df.sample(n=10)
Observations:
# number of missing values per column
df.isna().sum()
# percentage of missing values per column
# (mean of the boolean isna frame is exactly the missing fraction; this is
# one pass instead of the original sum()/count() double pass — same result)
round(df.isna().mean() * 100, 2)
Observations:
# let"s check for duplicate values in the data
df.duplicated().sum()
Observations:
There are no duplicated values.
# let"s view the statistical summary of the numerical columns in the data
df.describe().T
Observations:
# sub-frame of the numerical columns
# NOTE(review): despite the original "list" comment this is a DataFrame,
# and later code (the outlier computation) relies on it being one
num_cols = df.select_dtypes(include=["int64", "float64"])
# sub-frame of the categorical (non-numeric) columns
cat_cols = df.select_dtypes(exclude=["int64", "float64"])
# value counts for every categorical column
# (iterating a DataFrame yields its column names)
for i in cat_cols:
    print(df[i].value_counts())
    print("-"*50)
    print("\n")
Observations:
Education_Level, Income_Category and Card_Category can be checked for influence on Marital_Status
df[df["Marital_Status"].isna()]["Card_Category"].value_counts()
df[df["Card_Category"] == "Blue"]["Marital_Status"].value_counts()
df[df["Marital_Status"].isna()]["Income_Category"].value_counts()
df[df["Income_Category"] == "Less than $40K"]["Marital_Status"].value_counts()
df[df["Marital_Status"].isna()]["Education_Level"].value_counts()
Observations:
# impute missing Marital_Status values with the most frequent category
# (np.nan replaces the np.NaN alias, which was removed in NumPy 2.0)
mar_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
df.Marital_Status = mar_imputer.fit_transform(df["Marital_Status"].values.reshape(-1, 1))[:, 0]
# verify the imputation result
df["Marital_Status"].value_counts()
Observations:
NULL Values are treated.
Income_Category can be checked for influence on Education_Level
df[df["Education_Level"].isna()]["Income_Category"].value_counts()
df[df["Income_Category"] == "Less than $40K"]["Education_Level"].value_counts()
Observations:
# impute missing Education_Level values with the most frequent category
# (np.nan replaces the np.NaN alias, which was removed in NumPy 2.0)
edu_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
df.Education_Level = edu_imputer.fit_transform(df["Education_Level"].values.reshape(-1, 1))[:, 0]
# verify the imputation result
df["Education_Level"].value_counts()
Observations:
NULL Values are treated.
# percentage of missing values per column (re-check after imputation)
round(df.isna().sum() / df.isna().count() * 100, 2)
Observations:
We see that all null values are now treated
# check value_counts of Income_Category ("abc" is a data-entry error level)
df.Income_Category.value_counts()
# check Education_Level values for the erroneous income category "abc"
df[df["Income_Category"] == "abc"]["Education_Level"].value_counts()
Observations:
# treat the erroneous "abc" level: impute it with the most frequent category
ic_imputer = SimpleImputer(missing_values="abc", strategy="most_frequent")
# a one-column frame's .values is already the (n, 1) shape the imputer needs
income_fixed = ic_imputer.fit_transform(df[["Income_Category"]].values)
df.Income_Category = income_fixed[:, 0]
# verify the update
df.Income_Category.value_counts()
Observations:
Income_Category "abc" is now treated and imputed with most_frequent value "Less than $40k"
# distribution of the target before encoding
df.Attrition_Flag.value_counts()
# label-encode the target in a single fit_transform call
encode_attr = LabelEncoder()
df["Attrition_Flag"] = encode_attr.fit_transform(df["Attrition_Flag"])
# distribution of the target after encoding
df.Attrition_Flag.value_counts()
Observations:
The target column is now encoded with Existing Customer as 1 and Attrited Customer as 0.
# histogram and boxplot for a numerical feature
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Draw a boxplot and a histogram for one numerical feature on a shared x-axis.

    data: dataframe
    feature: dataframe column name
    figsize: size of figure (default (12, 7))
    kde: whether to show the density curve on the histogram (default False)
    bins: number of bins for the histogram (default None, i.e. let seaborn decide)
    """
    fig, (ax_box, ax_hist) = plt.subplots(
        nrows=2,
        sharex=True,  # shared x-axis so the boxplot aligns with the histogram
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # boxplot on top; showmeans=True marks the mean with a star
    sns.boxplot(data=data, x=feature, ax=ax_box, showmeans=True, color="orange")
    # histogram below; only pass bins when the caller supplied a value.
    # (The original used a conditional EXPRESSION for its side effects and
    # passed palette= without hue=, which seaborn ignores/warns about.)
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist)
    # vertical reference lines: green dashed = mean, blue solid = median
    ax_hist.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist.axvline(data[feature].median(), color="blue", linestyle="-")
    plt.show()  # show the plot
# bar plot with count/percentage labels
def labeled_barplot(data, feature, perc=False, v_ticks=True, n=None):
    """
    Barplot with the count (or percentage) annotated above each bar.

    data: dataframe
    feature: dataframe column name
    perc: whether to display percentages instead of counts (default False)
    v_ticks: whether to rotate the x tick labels vertically (default True;
        was undocumented in the original docstring)
    n: display only the top n category levels (default None, i.e. all levels)
    """
    total = len(data[feature])  # denominator for the percentage labels
    count = data[feature].nunique()
    # widen the figure with the number of categories shown
    plt.figure(figsize=((count if n is None else n) + 1, 5))
    if v_ticks:
        plt.xticks(rotation=90)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:
            # percentage of the whole column falling in this category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()  # raw count for this category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate above the bar
    plt.show()  # show the plot
# function to plot a stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the crosstab of predictor vs target and plot a stacked bar chart
    of the target distribution within each predictor level.

    data: dataframe
    predictor: independent variable (column name)
    target: target variable (column name)
    """
    count = data[predictor].nunique()
    # sort categories by the rarer target class (last entry of value_counts)
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    # single legend outside the axes
    # (the original called plt.legend twice; the first call was discarded)
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
# use the defined function histogram_boxplot to plot the graphs
# NOTE(review): CLIENTNUM is a customer id, so its distribution carries no
# analytical meaning — confirm this plot is intentional
histogram_boxplot(df,"CLIENTNUM")
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Attrition_Flag", True, False)
Observations:
Target variable shows 83.9% customers are existing and 16.1% are attrited.
# distribution of customer age
histogram_boxplot(df,"Customer_Age")
# inspect the customers older than 70 (right tail of the distribution)
df.loc[df["Customer_Age"] > 70]
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Gender", True, False)
Observations:
Number of Female customers is higher than Male customers.
# use label_barplot function to plot the graph
labeled_barplot(df,"Dependent_count", True, False)
Observations:
Most customers have 3 dependents followed by customers with 2 and 1 dependents.
# use label_barplot function to plot the graph
labeled_barplot(df,"Education_Level", True, True)
Observations:
Most customers are Graduate followed by High School.
# use label_barplot function to plot the graph
labeled_barplot(df,"Marital_Status", True, False)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Income_Category", True, True)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Card_Category", True, False)
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Months_on_book")
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Total_Relationship_Count", True, False)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Months_Inactive_12_mon", True, False)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Contacts_Count_12_mon", True, False)
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Credit_Limit")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Revolving_Bal")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Avg_Open_To_Buy")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Amt_Chng_Q4_Q1")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Trans_Amt")
Observations:
# distribution of total transaction count
histogram_boxplot(df,"Total_Trans_Ct")
# inspect the customers with more than 130 transactions (right tail)
df.loc[df["Total_Trans_Ct"] > 130]
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Ct_Chng_Q4_Q1")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Avg_Utilization_Ratio")
Observations:
# box plots of every numerical variable split by the target
plt.figure(figsize=(20, 30))
for i, variable in enumerate(num_cols):
    plt.subplot(5, 3, i + 1)
    # seaborn >= 0.12 removed positional x/y data arguments,
    # so pass them explicitly as keywords (same plot, forward-compatible)
    sns.boxplot(x=df["Attrition_Flag"], y=df[variable], palette="Set1")
    plt.tight_layout()
    plt.title(variable)
plt.show()
Observations:
# attrition split within each gender
stacked_barplot(df, "Gender", "Attrition_Flag")
Observations:
The number of male customers closing their credit card account is slightly lower than that of female customers.
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Education_Level", "Attrition_Flag")
Observations:
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Marital_Status", "Attrition_Flag")
Observations:
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Income_Category", "Attrition_Flag")
Observations:
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Card_Category", "Attrition_Flag")
Observations:
Platinum card members are most likely to close the credit card compared to other categories.
# heatmap of pairwise correlations between the numerical columns
plt.figure(figsize=(15, 7))
# numeric_only=True: the frame still holds object columns, and pandas >= 2.0
# raises on corr() over non-numeric data instead of silently dropping it
sns.heatmap(df.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
Observations:
# pairwise scatter plots of all numerical variables, colored by the target
# (the original comment said "heatmap", but this is a pairplot)
sns.pairplot(df,hue="Attrition_Flag")
Observations:
As we have seen during bivariate analysis there are no clear indicators of attrition as all variables show similar data for both existing customers and attrited customers.
# box plots of the raw numerical columns to inspect outliers before treatment
plt.figure(figsize=(20, 30))
for i, variable in enumerate(num_cols):
    plt.subplot(5, 4, i + 1)
    plt.boxplot(df[variable], whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
# 25th and 75th percentiles of the NUMERICAL columns only
# (quantile on the full mixed-dtype frame raises in pandas >= 2.0; older
# pandas silently dropped the non-numeric columns — same values either way)
Q1 = num_cols.quantile(0.25)
Q3 = num_cols.quantile(0.75)
# Inter Quartile Range (75th percentile - 25th percentile)
IQR = Q3 - Q1
# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are outliers
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
# percentage of outliers per numerical column
((num_cols < lower) | (num_cols > upper)).sum() / len(df) * 100
Observations:
# to plot confusion matrix
def draw_matrix(model, predictors, target):
    """
    Plot the confusion matrix of model predictions, with the count and the
    percentage of the total in each cell.

    model: fitted classifier (must implement predict)
    predictors: independent variables
    target: true labels
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    total = cm.sum()
    # one "count\npercent" label per cell; reshaping to cm.shape generalizes
    # the function to any number of classes (the original hard-coded (2, 2))
    labels = np.asarray(
        [
            "{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)
            for item in cm.flatten()
        ]
    ).reshape(cm.shape)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()
# calculate different metric scores of the model - Accuracy, Recall, F1 and Precision
def get_metrics_score(model, flag=True):
    """
    Compute accuracy, recall, precision and F1 for *model* on the
    train / validation / test splits (the module-level X_train, X_val,
    X_test and matching y_* variables must already exist).

    model: fitted classifier to evaluate
    flag: when True (default), also print every metric

    Returns (model_df, score_list):
    model_df: DataFrame with one row per data set
    score_list: flat list of the 12 scores, ordered accuracy, recall,
        precision, F1 — train, validation, test within each metric
    """
    # predictions for each split (computed once, reused for every metric)
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    pred_test = model.predict(X_test)
    # accuracy of the model on each split
    accuracy_list = [
        model.score(X_train, y_train),
        model.score(X_val, y_val),
        model.score(X_test, y_test),
    ]
    # recall of the model on each split
    recall_list = [
        metrics.recall_score(y_train, pred_train),
        metrics.recall_score(y_val, pred_val),
        metrics.recall_score(y_test, pred_test),
    ]
    # precision of the model on each split
    precision_list = [
        metrics.precision_score(y_train, pred_train),
        metrics.precision_score(y_val, pred_val),
        metrics.precision_score(y_test, pred_test),
    ]
    # F1 score of the model on each split
    f1_list = [
        metrics.f1_score(y_train, pred_train),
        metrics.f1_score(y_val, pred_val),
        metrics.f1_score(y_test, pred_test),
    ]
    # flat list in the original order: accuracies, recalls, precisions, F1s
    score_list = accuracy_list + recall_list + precision_list + f1_list
    # print every metric only when requested (default True)
    if flag:
        set_names = ("training", "validation", "test")
        for metric_name, values in (
            ("Accuracy", accuracy_list),
            ("Recall", recall_list),
            ("Precision", precision_list),
            ("F1", f1_list),
        ):
            for set_name, value in zip(set_names, values):
                print("{} on {} set : ".format(metric_name, set_name), value)
    model_df = pd.DataFrame(
        {
            "DataSet": ["Training", "Validation", "Testing"],
            "Accuracy": accuracy_list,
            "Recall": recall_list,
            "Precision": precision_list,
            "F1": f1_list,
        }
    )
    return model_df, score_list  # table and flat list of train/val/test scores
# Separating target variable and other variables
# NOTE(review): CLIENTNUM is a customer id and is kept as a feature here;
# the commented-out variants dropped it — confirm this is intentional, since
# an id column can leak meaningless "importance" into tree models.
#X = df.drop(["CLIENTNUM","Attrition_Flag","Total_Ct_Chng_Q4_Q1","Total_Amt_Chng_Q4_Q1","Avg_Utilization_Ratio"], axis=1)
#X = df.drop(["CLIENTNUM","Attrition_Flag"], axis=1)
X = df.drop(["Attrition_Flag"], axis=1)
y = df["Attrition_Flag"]
# one-hot encode the categorical features (drop_first avoids collinearity)
X = pd.get_dummies(data=X, drop_first=True)
Training Set to have 60% data and Validation and Testing sets to have 20% data each
# Splitting data into training, validation and test sets (60/20/20):
# first split off 20% as the test set, stratified on the target so the
# attrition ratio is preserved in every split
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then split the remaining 80% into train and validation;
# 0.25 of the 80% equals 20% of the full data
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
# SMOTE - Synthetic Minority Over Sampling Technique: balance classes by
# synthesizing minority-class samples (sampling_strategy=1 -> 1:1 ratio)
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)
# fit the sampler and create the OVERsampled training data
# (the original comment incorrectly said "undersampled" here)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
# random undersampler: balance classes by dropping majority-class samples
rus = RandomUnderSampler(random_state=1)
# fit the sampler and create the undersampled training data
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
# list of (name, model) pairs to compare
models = []
models.append(("Logistic regression", LogisticRegression(random_state=1)))
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("Gradient Boosting", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("Decision tree", DecisionTreeClassifier(random_state=1)))
# cross-validated recall scores per model
results = []
# mean CV recall (rounded) per model
best_scores = []
# model names, in the same order as results
names = []
print("\n" "Cross-Validation Performance on Training Set:" "\n")
# the scorer name and the CV splitter are loop-invariant, so build them once
# (same fixed random_state -> identical folds as the per-iteration originals)
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
    best_scores.append(round(cv_result.mean() * 100, 2))
print("\n" "Training Set Performance:" "\n")
# fit and predict the models training set
for name, model in models:
model.fit(X_train, y_train)
scores = recall_score(y_train, model.predict(X_train)) * 100
print("{}: {}".format(name, scores))
print("\n" "Validation Set Performance:" "\n")
# predict the models on validation set
for name, model in models:
scores = recall_score(y_val, model.predict(X_val)) * 100
print("{}: {}".format(name, scores))
# Plotting boxplots for CV scores of all models defined above
fig = plt.figure(figsize=(15, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Observations:
# cross-validated recall scores per model on the oversampled training data
results_over = []
# mean CV recall (rounded) per model
best_scores_over = []
# model names, in the same order as results_over
names_over = []
print("\n" "Cross-Validation Performance on Oversampled Training Set:" "\n")
# scorer and splitter are loop-invariant; build them once (fixed seed ->
# identical folds to the per-iteration originals)
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results_over.append(cv_result)
    names_over.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
    best_scores_over.append(round(cv_result.mean() * 100, 2))
print("\n" "Oversampled Training Performance:" "\n")
# refit every model on the oversampled training set and report recall
for name, model in models:
    model.fit(X_train_over, y_train_over)
    scores_over = recall_score(y_train_over, model.predict(X_train_over)) * 100
    print("{}: {}".format(name, scores_over))
print("\n" "Validation Set Performance:" "\n")
# validation recall of the models fitted on the oversampled data
for name, model in models:
    scores = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {}".format(name, scores))
# box plots of the CV recall distributions
fig = plt.figure(figsize=(15, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results_over)
ax.set_xticklabels(names_over)
plt.show()
Observations:
# cross-validated recall scores per model on the undersampled training data
results_under = []
# mean CV recall (rounded) per model
best_scores_under = []
# model names, in the same order as results_under
names_under = []
print("\n" "Cross-Validation Performance on Undersampled Training set:" "\n")
# scorer and splitter are loop-invariant; build them once (fixed seed ->
# identical folds to the per-iteration originals)
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for name, model in models:
    cv_result = cross_val_score(
        estimator=model, X=X_train_under, y=y_train_under, scoring=scoring, cv=kfold
    )
    results_under.append(cv_result)
    names_under.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
    best_scores_under.append(round(cv_result.mean() * 100, 2))
print("\n" "Undersampled Training Performance:" "\n")
# refit every model on the undersampled training set and report recall
for name, model in models:
    model.fit(X_train_under, y_train_under)
    scores_under = recall_score(y_train_under, model.predict(X_train_under)) * 100
    print("{}: {}".format(name, scores_under))
print("\n" "Validation Set Performance:" "\n")
# validation recall of the models fitted on the undersampled data
for name, model in models:
    scores = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {}".format(name, scores))
# box plots of the CV recall distributions
fig = plt.figure(figsize=(15, 7))
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results_under)
ax.set_xticklabels(names_under)
plt.show()
Observations:
# collect the mean CV recall of each sampling strategy into one table
model_names = ["Logistic regression", "Bagging", "Random forest",
               "Gradient Boosting", "Adaboost", "Xgboost", "Decision tree"]
comparison_frame = pd.DataFrame(
    {
        "Model": model_names,
        "Training Set": best_scores,
        "Oversampled Training": best_scores_over,
        "Undersampled Training": best_scores_under,
    }
)
comparison_frame
Observations:
# random forest classifier to tune
rfc = RandomForestClassifier(random_state=1)
# parameter combinations are compared by recall
scorer = metrics.make_scorer(metrics.recall_score)
# grid of parameters to sample from
param_grid_rfc = {
    "n_estimators": np.arange(50, 150, 50),
    "min_samples_leaf": np.arange(1, 6, 1),
    # "sqrt" replaces the "auto" alias removed in scikit-learn 1.3
    # (identical behavior for classifiers)
    "max_features": ["log2", 0.7, 0.9, "sqrt"],
    # BUG FIX: the step was None, which np.arange treats as 1, so the grid
    # contained only [0.3]; 0.1 restores the intended 0.3-0.6 range
    "max_samples": np.arange(0.3, 0.7, 0.1),
    "max_depth": np.arange(1, 5, 1),
}
# randomized search: 50 candidates, 5-fold CV, all cores
rfc_tuned2 = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid_rfc, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs=-1)
# Fitting parameters in RandomizedSearchCV
rfc_tuned2.fit(X_train, y_train)
# print best parameters
print("Best parameters are {} with CV score={}:".format(rfc_tuned2.best_params_, rfc_tuned2.best_score_))
# building the random forest with the best parameters found by the search
rfc_tuned_rcv = RandomForestClassifier(
    random_state=1,
    max_features=0.9,
    max_samples=0.3,
    min_samples_leaf=5,
    n_estimators=100,
    max_depth=1,
)
# Fit the model on training data
rfc_tuned_rcv.fit(X_train, y_train)
# accuracy/recall/precision/F1 on train, validation and test
# (flag=False suppresses the per-metric prints)
rfc_tuned_rcv_score, rfc_tuned_rcv_list = get_metrics_score(
    rfc_tuned_rcv, False
)
# metrics table
rfc_tuned_rcv_score
# confusion matrix on the validation set
draw_matrix(rfc_tuned_rcv, X_val, y_val)
Observations:
# XGBoost classifier to tune
xgb = XGBClassifier(random_state=1, eval_metric="logloss")
# hyper-parameter search space
param_grid_xgb = {
    "n_estimators": np.arange(50, 150, 50),
    "scale_pos_weight": [2, 5, 10],
    "learning_rate": [0.01, 0.1, 0.2, 0.05],
    "gamma": [0, 1, 3, 5],
    "subsample": [0.8, 0.9, 1],
    "max_depth": np.arange(1, 5, 1),
    "reg_lambda": [5, 10],
}
# parameter combinations are compared by recall
scorer = metrics.make_scorer(metrics.recall_score)
# randomized search: 50 candidates, 5-fold CV, all cores
xgb_tuned2 = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid_xgb,
    n_iter=50,
    scoring=scorer,
    cv=5,
    random_state=1,
    n_jobs=-1,
)
# fit the search on the training data
xgb_tuned2.fit(X_train, y_train)
# report the winning configuration
print("Best parameters are {} with CV score={}:".format(xgb_tuned2.best_params_, xgb_tuned2.best_score_))
# building the XGBoost model with the best parameters from the search
xgb_tuned_rcv = XGBClassifier(
    random_state=1,
    n_estimators=100,
    scale_pos_weight=2,  # up-weights the positive class to counter imbalance
    subsample=0.8,
    learning_rate=0.01,
    gamma=5,
    eval_metric="logloss",
    reg_lambda=10,
    max_depth=1,
)
# Fit the model on training data
xgb_tuned_rcv.fit(X_train, y_train)
# accuracy/recall/precision/F1 on train, validation and test (no printing)
xgb_tuned_rcv_score, xgb_tuned_rcv_list = get_metrics_score(
    xgb_tuned_rcv, False
)
# metrics table
xgb_tuned_rcv_score
# confusion matrix on the validation set
draw_matrix(xgb_tuned_rcv, X_val, y_val)
Observations:
# gradient boosting classifier to tune
gbc = GradientBoostingClassifier(random_state=1)
# parameter grid (the duplicate "max_depth" entry was removed; "sqrt"
# replaces the "auto" max_features alias removed in scikit-learn 1.3)
param_grid_gbc = {
    "n_estimators": np.arange(50, 150, 50),
    "learning_rate": [0.01, 0.1, 0.2, 0.05],
    "subsample": [0.8, 0.9, 1],
    "max_depth": np.arange(1, 5, 1),
    "min_samples_leaf": np.arange(1, 6, 1),
    "max_features": ["log2", 0.7, 0.9, "sqrt"],
}
# parameter combinations are compared by recall
scorer = metrics.make_scorer(metrics.recall_score)
# BUG FIX: the search was run with estimator=xgb, so the gradient boosting
# model was never actually tuned; it must use gbc
gbc_tuned2 = RandomizedSearchCV(estimator=gbc, param_distributions=param_grid_gbc, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs=-1)
# fit the search on the training data
gbc_tuned2.fit(X_train, y_train)
# print best parameters
print("Best parameters are {} with CV score={}:".format(gbc_tuned2.best_params_, gbc_tuned2.best_score_))
# building the gradient boosting model with the best parameters found
gbc_tuned_rcv = GradientBoostingClassifier(
    random_state=1,
    n_estimators=50,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=1,
    min_samples_leaf=4,
    max_features=0.7,
)
# Fit the model on training data
gbc_tuned_rcv.fit(X_train, y_train)
# accuracy/recall/precision/F1 on train, validation and test (no printing)
gbc_tuned_rcv_score, gbc_tuned_rcv_list = get_metrics_score(
    gbc_tuned_rcv, False
)
# metrics table
gbc_tuned_rcv_score
# confusion matrix on the validation set
draw_matrix(gbc_tuned_rcv, X_val, y_val)
Observations:
# metric labels for the comparison table, in the same order as the score
# lists returned by get_metrics_score (accuracy, recall, precision, F1 —
# train/validation/test within each metric)
metric_rows = [
    "{} - {}".format(metric, dataset)
    for metric in ("Accuracy", "Recall", "Precision", "F1")
    for dataset in ("Training", "Validation", "Test")
]
comparison_frame1 = pd.DataFrame(
    {
        "Model": metric_rows,
        "Random Forest - Randomized Search": rfc_tuned_rcv_list,
        "XG Boost - Randomized Search": xgb_tuned_rcv_list,
        "Gradient Boost - Randomized Search": gbc_tuned_rcv_list,
    }
)
comparison_frame1
Observations:
# horizontal bar chart of the tuned XGBoost feature importances,
# ordered from least to most important
feature_names = X.columns
importances = xgb_tuned_rcv.feature_importances_
order = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(order)), importances[order], color="violet", align="center")
plt.yticks(range(len(order)), [feature_names[i] for i in order])
plt.xlabel("Relative Importance")
plt.show()
Observations:
Total_Trans_Ct is the most important variable, followed by Total_Ct_Chng_Q4_Q1 and Total_Revolving_Bal.
Now, we have a final model. let's use pipelines to put the model into production
# list of numerical variables fed to the numeric branch of the pipeline
# NOTE(review): CLIENTNUM is a customer id; including it as a model feature
# is questionable — confirm before production use
numerical_features = [
    "CLIENTNUM",
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
]
# numeric branch: median imputation only
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# list of categorical variables fed to the categorical branch
categorical_features = [
    "Gender",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
# categorical branch: most-frequent imputation, then one-hot encoding;
# handle_unknown="ignore" lets the model cope with categories unseen at fit time
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# combine both branches; remainder="passthrough" forwards any column not
# listed in numerical_features / categorical_features unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# Separating target variable and other variables (raw frame; the pipeline
# handles imputation and encoding itself)
X = df.drop(columns="Attrition_Flag")
Y = df["Attrition_Flag"]
We already have the model to be tuned, so we will need only Training and Testing Sets.
# 70/30 train-test split, stratified to preserve the attrition ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
# production pipeline: preprocessing followed by the tuned XGBoost model
model = Pipeline(
    steps=[
        ("PRE", preprocessor),
        (
            "XGB",
            # hyper-parameters taken from the randomized search above
            XGBClassifier(
                random_state=1,
                n_estimators=100,
                scale_pos_weight=2,
                subsample=0.8,
                learning_rate=0.01,
                gamma=5,
                eval_metric="logloss",
                reg_lambda=10,
                max_depth=1,
            ),
        ),
    ]
)
# fit the whole pipeline on the raw training data
model.fit(X_train, y_train)
# predictions on the held-out test set
model.predict(X_test)